/**********************************************

localsearch.cxx

Copyright 2001 by Adam Meyerson and Liadan O'Callaghan

This code is provided for examination by anyone who may be interested in 
implementing local search clustering algorithms;  it is not intended to be 
run, but to be used as a starting point for anyone who wants to write 
his or her own implementation. 

In particular: We, the authors, do not provide a license for the use
of this code. If you use this code instead of writing your own, we
don't want to know. We do not claim that it works, and will not
maintain it or provide support to those who use it. Use of this code
is at the risk of the user.

*****
Paper reference:

Streaming-Data Algorithms for High-Quality Clustering, 

by Liadan O'Callaghan, Nina Mishra, Adam Meyerson, 
Sudipto Guha, and Rajeev Motwani, 

ICDE 2002. 
*****

This local search code computes a continuous k-median clustering of a 
given point set, where k is within a user-specified range.

To execute:

localsearch k1 k2 n d infile samplesize outfile

k1 and k2 are lower and upper bounds on the number of medians you want 
to find.

n is the number of points in infile, the file of points to cluster.

d is the dimensionality of the points in infile, which will be taken as 
points in real space with the L2 norm.

If samplesize is set to some s > 0, a sample of s points will be taken 
uniformly at random with replacement from infile, and the clustering will 
be computed just on this sample.  Setting samplesize to 0 or less will 
result in a clustering based on all n points.

infile must contain one point per line, with each point given in the 
format:

w x1 x2 x3 ... xd

where w is the weight of the point and its d coordinates are x1 through 
xd.

outfile will be created if it does not exist, and overwritten otherwise.  
It will contain the medians found by localsearch, and it will be in the 
same format as infile: the weight of a median will correspond to the total 
weight of all points assigned to it.

There must be a file called 'seeds' in the directory in which localsearch 
is running; this file must contain an integer.  localsearch will use this 
integer to seed its random number generator, and will replace it with a 
new integer each time it is run.
***********************************/


#include<stdio.h>
#include<iostream>
#include<fstream>
#include<stdlib.h>
#include<sys/time.h>
#include<string.h>
#include<assert.h>
#include<math.h>
#include<sys/resource.h>
#include<limits.h>

#define MAXNAMESIZE 1024 // max filename length
#define SEED 1
/* increase this to reduce probability of random error */
/* increasing it also ups running time of "speedy" part of the code */
/* SP = 1 seems to be fine */
#define SP 1 // number of repetitions of speedy must be >=1

/* higher ITER --> more likely to get correct # of centers */
/* higher ITER also scales the running time almost linearly */
#define ITER 3 // iterate ITER* k log k times; ITER >= 1

using namespace std;
/* this structure represents a point */
/* these will be passed around to avoid copying coordinates */
typedef struct {
  float weight;
  float *coord;
  long assign;  /* number of point where this one is assigned */
  float cost;  /* cost of that assignment, weight*distance */
} Point;

/* this is the array of points */
typedef struct {
  long num; /* number of points; may not be N if this is a sample */
  int dim;  /* dimensionality */
  Point *p; /* the array itself */
} Points;


void sample(Points *points, Points *sampl, long size);
void sample2(Points *points, long size);

int isIdentical(float *i, float *j, int D)
  // tells whether two points of D dimensions are identical
{
  int a = 0;
  int equal = 1;

  while (equal && a < D) {
    if (i[a] != j[a]) equal = 0;
    else a++;
  }
  if (equal) return 1;
  else return 0;

}


void getSeed(int *s)
{
  /* read in random seed from file called "seeds" */
  FILE *fpSeeds;

  fpSeeds = fopen("seeds", "r");
  if (fpSeeds == NULL) {
    cerr << "CREATE A FILE CALLED seeds CONTAINING AN INTEGER!\n";
    exit(1);
  }
  fscanf(fpSeeds, "%d", s);
  fclose(fpSeeds);

  //  cout << "Using Random Seed: " << *s << endl;

  srand48(*s);

  fpSeeds = fopen("seeds", "w");
  fprintf(fpSeeds, "%d\n", lrand48());
  fclose(fpSeeds);
}



/* read the points from filename into the array points */
void readpoints(ifstream *ifile, Points *points)
{
  long i, ii;
  for(i=0;i<points->num;i++) {
    *ifile >> points->p[i].weight;
    for (ii=0;ii<points->dim;ii++)
	{
	  char temp[128];
	  memset(temp, 0, 128 * sizeof(char));
	  *ifile >> temp;
      points->p[i].coord[ii] = atof(temp);
	}
  }
}

/* read the command line to get kmin, kmax, point array */
int readcommands(int argc, char **argv, long *kmin, long *kmax,
		 Points *points, Points *sampl, long *samplesize,
		 char **outfile)
{
  long i;
  if (argc<9) {
    cerr << "usage: " << argv[0] << " k1 k2 n d infile samplesize outfile"
	 << endl;
    cerr << "where n = # of points, d = # of dimensions;" << endl;
    cerr << "(note k1 < k2)" << endl;
    cerr << "Sampling is with replacement whenever samplesize > 0." << endl;
    cerr << "If samplesize <= 0, algorithm is run on whole dataset." << endl;
    return(0);
  }
  
  return(1);
}

/* build a sample of given size, points drawn uniformly at random */
/* with replacement */

void sample(Points *points, Points *sampl, long size)
{
  long i, j;
  long randpoint;
  sampl->num = size;
  sampl->dim = points->dim;
  sampl->p = (Point *)malloc(sizeof(Point)*size);

  for( i = 0; i < size; i++ ) {
    sampl->p[i].coord = (float *)malloc(sampl->dim * sizeof(float));
  }

  for ( i = 0; i < size; i++ ) {
    // pick a point from points, uniformly at random
    randpoint = lrand48() % points->num;

    // copy that point into the ith location in the sample
    for ( j = 0; j < sampl->dim; j++ ) {
      sampl->p[i].coord[j] = points->p[randpoint].coord[j];
    }
    sampl->p[i].weight = points->p[randpoint].weight;
  }
  
  for ( i = 0; i < points->num; i++ ) {
    free(points->p[i].coord);
  }
  free(points->p);
  points->p = sampl->p;
  points->num = sampl->num;
}

/* comparator for floating point numbers */
static int floatcomp(const void *i, const void *j)
{
  float a, b;
  a = *(float *)(i);
  b = *(float *)(j);
  if (a > b) return (1);
  if (a < b) return (-1);
  return(0);
}

/* select a sample with total WEIGHT equal to size */
/* this is sampling of size points with replacement */
/* multiple selections increase the effective weight of a point */
/* thus, the total number of points in the sample is <= size */
/* this procedure will free old points structure while running */
void sample2(Points *points, long size)
{
  long i, ii, iii;
  float totalweight;
  float *weightarray;
  float mytotal;
  Points *sampl;

  sampl = (Points *)malloc(sizeof(Points));
  totalweight=0.0;
  weightarray = (float *)malloc(sizeof(float)*size);
  sampl->p=(Point *)malloc(sizeof(Point)*size);
  for (i=0;i<points->num;i++)
    totalweight += points->p[i].weight;
  /* determine random weights between 0 and totalweight */
  for (i=0;i<size;i++) {
    weightarray[i] = lrand48()/(float)INT_MAX;
    weightarray[i] *= totalweight;
    if (weightarray[i] > totalweight)
      fprintf(stderr, "Random Number Error!\n");
  }
  /* sort these weights from smallest to largest */
  qsort(weightarray, size, sizeof(float), floatcomp);
  mytotal=0.0; ii=0; iii=0;
  /* look through all the points, deciding how many times to sample them */
  for (i=0;i<points->num;i++) {
    mytotal += points->p[i].weight;
    /* check whether current point was sampled */
    if ((ii<size)&&(mytotal > weightarray[ii])) {
      sampl->p[iii].weight = 1.0;
      sampl->p[iii].coord = points->p[i].coord;
      ii++;
      /* check if this point was chosen again due to high weight */
      while ((ii < size)&&(mytotal > weightarray[ii])) {
	sampl->p[iii].weight += 1.0;
	ii++;
      }
      iii++;
    }
    /* this point will not appear in the sample */
    else {
      free(points->p[i].coord);
    }
  }
  sampl->num = iii;
  sampl->dim = points->dim;
  free(points->p);
  /* copy sampl over to points */
  points->num = sampl->num;
  points->dim = sampl->dim;
  points->p = sampl->p;
}

/* shuffle points into random order */
void shuffle(Points *points)
{
  long i, j;
  Point temp;
  for (i=0;i<points->num-1;i++) {
    j=(lrand48()%(points->num - i)) + i;
    temp = points->p[i];
    points->p[i] = points->p[j];
    points->p[j] = temp;
  }
}

/* shuffle an array of integers */
void intshuffle(int *intarray, int length)
{
  long i, j;
  int temp;
  for (i=0;i<length;i++) {
    j=(lrand48()%(length - i))+i;
    temp = intarray[i];
    intarray[i]=intarray[j];
    intarray[j]=temp;
  }
}


/* compute Euclidean distance squared between two points */
float dist(Point p1, Point p2, int dim)
{
  int i;
  float result=0.0;
  for (i=0;i<dim;i++)
    result += (p1.coord[i] - p2.coord[i])*(p1.coord[i] - p2.coord[i]);
  return(result);
}


/* run speedy on the points, return total cost of solution */
float speedy(Points *points, float z, long *k)
{
  long i, ii, closest;
  long *centers;
  float mindist, distance;
  float totalcost;

#ifdef PRINTINFO
  fprintf(stderr, "Speedy: facility cost %lf\n", z);
#endif
  centers=(long *)malloc(sizeof(long)*points->num);
  /* create center at first point, send it to itself */
  centers[0] = 0; *k=1;
  points->p[0].assign=0; points->p[0].cost = 0.0;
  totalcost = z;
  for (i=1;i<points->num;i++) {
    /* find closest center */
    closest=0; mindist=dist(points->p[i],points->p[0],points->dim);
    for (ii=1;ii<*k;ii++) {
      distance=dist(points->p[i], points->p[centers[ii]],points->dim);
      if (distance < mindist) {
	mindist=distance; closest=ii;
      }
    }
    /* decide whether to open new facility */
    if (((float)lrand48()/(float)INT_MAX)<(mindist*points->p[i].weight/z)) {
      centers[(*k)++] = i;
      points->p[i].assign=i; points->p[i].cost = 0.0;
      totalcost+=z;
    }
    else {
      points->p[i].assign=centers[closest];
      points->p[i].cost = mindist*points->p[i].weight;
      totalcost += points->p[i].cost;
    }
  }
#ifdef PRINTINFO
  fprintf(stderr, "Speedy opened %d facilities for total cost %lf\n",
	 *k, totalcost);
  fprintf(stderr, "Distance Cost %lf\n", totalcost - z*(*k));
#endif
  free(centers);
  return(totalcost);
}

/* run speedy on the points, return total cost of solution */
float speedy2(Points *points, float z, long *k, long klim)
{
  long i, ii, closest;
  long *centers;
  float mindist, distance;
  float totalcost;

#ifdef PRINTINFO
  fprintf(stderr, "Speedy: facility cost %lf\n", z);
#endif
  centers=(long *)malloc(sizeof(long)*points->num);
  /* create center at first point, send it to itself */
  centers[0] = 0; *k=1;
  points->p[0].assign=0; points->p[0].cost = 0.0;
  totalcost = z;
  for (i=1;i<points->num;i++) {
    /* find closest center */
    closest=0; mindist=dist(points->p[i],points->p[0],points->dim);
    for (ii=1;ii<*k;ii++) {
      distance=dist(points->p[i], points->p[centers[ii]],points->dim);
      if (distance < mindist) {
	mindist=distance; closest=ii;
      }
    }
    /* decide whether to open new facility */
    if (((float)lrand48()/(float)INT_MAX)<(mindist*points->p[i].weight/z)) {
      centers[(*k)++] = i;
      points->p[i].assign=i; points->p[i].cost = 0.0;
      totalcost+=z;
    }
    else {
      points->p[i].assign=centers[closest];
      points->p[i].cost = mindist*points->p[i].weight;
      totalcost += points->p[i].cost;
    }
    if (*k > klim) {
      free(centers);
      return(totalcost);
    }
  }
#ifdef PRINTINFO
  fprintf(stderr, "Speedy opened %d facilities for total cost %lf\n",
	 *k, totalcost);
  fprintf(stderr, "Distance Cost %lf\n", totalcost - z*(*k));
#endif
  free(centers);
  return(totalcost);
}


/* For a given point x, find the cost of the following operation:
 * -- open a facility at x if there isn't already one there,
 * -- for points y such that the assignment distance of y exceeds dist(y, x),
 *    make y a member of x,
 * -- for facilities y such that reassigning y and all its members to x 
 *    would save cost, realize this closing and reassignment.
 * 
 * If the cost of this operation is negative (i.e., if this entire operation
 * saves cost), perform this operation and return the amount of cost saved;
 * otherwise, do nothing.
 */

/* numcenters will be updated to reflect the new number of centers */
/* z is the facility cost, x is the number of this point in the array 
   points */

float gain(long x, Points *points, float z, long *numcenters)
{
  int i;
  int number_of_points = points->num;
  int number_of_centers_to_close = 0;

  // for each point y, will y switch membership to x?
  int *switch_membership = (int *) malloc(sizeof(int) * number_of_points); 
  for ( i = 0; i < number_of_points; i++ ) {
    switch_membership[i] = 0;
  }

  // for each point y, if y is a median, how much will it save by closing
  // and shifting itself and its members to x?
  float *lower = (float *) malloc(sizeof(int) * number_of_points);
  for ( i = 0; i < number_of_points; i++ ) {
    lower[i] = 0;
  }

  // We will calculate the cost of opening a median at x
  float cost_of_opening_x;

  if ( points->p[x].assign == x ) {
    // x is already a median; opening it doesn't cost an additional z
    cost_of_opening_x = 0;
  }
  else {
    cost_of_opening_x = z;
  }


  // every median would save ("lower" its cost by) z if it could close
  // and reassign itself and its members to x

  for ( i = 0; i < number_of_points; i++ ) {
    lower[points->p[i].assign] = z;
  }

  for ( i = 0; i < number_of_points; i++ ) {

    float x_cost = dist(points->p[i], points->p[x], points->dim) 
                   * points->p[i].weight;
    float current_cost = 
      dist(points->p[i], points->p[points->p[i].assign], points->dim)
      * points->p[i].weight;


    if ( x_cost < current_cost ) {

      // point i would save cost just by switching to x
      // (note that i cannot be a median, 
      // or else dist(p[i], p[x]) would be 0)
      
      switch_membership[i] = 1;
      cost_of_opening_x += x_cost - current_cost;

    } else {

      // cost of assigning i to x is at least current assignment cost of i

      // consider the savings that i's **current** median would realize
      // if we reassigned that median and all its members to x;
      // note we've already accounted for the fact that the median
      // would save z by closing; now we have to subtract from the savings
      // the extra cost of reassigning that median and its members 

      lower[points->p[i].assign] += current_cost - x_cost;

    }
  
  }

  // at this time, we can calculate the cost of opening a center
  // at x; if it is negative, we'll go through with opening it

  for ( i = 0; i < number_of_points; i++ ) {
    if ( lower[i] > 0 ) {
      // i is a median, and
      // if we were to open x (which we still may not) we'd close i


      // note, we'll ignore the following quantity unless we do open x
      ++number_of_centers_to_close;  

      cost_of_opening_x -= lower[i];
    }
  }

  // now, check whether opening x would save cost; if so, do it,
  // and otherwise do nothing

  if ( cost_of_opening_x < 0 ) {
    //  we'd save money by opening x; we'll do it
    for ( i = 0; i < number_of_points; i++ ) {

      if ( switch_membership[i] || lower[points->p[i].assign] > 0 ) {
	// Either i's median (which may be i itself) is closing,
	// or i is closer to x than to its current median

	points->p[i].cost = points->p[i].weight *
                            dist(points->p[i], points->p[x], points->dim);
	points->p[i].assign = x;
      }
    }

    *numcenters = *numcenters + 1 - number_of_centers_to_close;
  }
  else {
    cost_of_opening_x = 0;  // the value we'll return
  }

  free(switch_membership);
  free(lower);
  return -cost_of_opening_x;
  
}

/* facility location on the points using local search */
/* z is the facility cost, returns the total cost and # of centers */
/* assumes we are seeded with a reasonable solution */
/* cost should represent this solution's cost */
/* halt if there is < e improvement after iter calls to gain */
/* feasible is an array of numfeasible points which may be centers */

float FL(Points *points, int *feasible, int numfeasible,
	 float z, long *k, double cost, long iter, float e)
{
  long i;
  long x;
  double change;
  long numberOfPoints;

#ifdef PRINTINFO
  fprintf(stderr, "%d centers, cost %lf, total distance %lf\n",
	 *k, cost, cost - z*(*k));
#endif
  change = -cost;
  /* continue until we run iter iterations without improvement */
  /* stop instead if improvement is less than e */
  while (change/cost < -1.0*e) {
    change = 0.0;
    numberOfPoints = points->num;
    /* randomize order in which centers are considered */


    intshuffle(feasible, numfeasible);
    for (i=0;i<iter;i++) {
      if (iter >= numfeasible)
	x = i%numfeasible;
      else
	x = lrand48()%numfeasible;
      change += gain(feasible[x], points, z, k);
    }

    cost += change;
#ifdef PRINTINFO
    fprintf(stderr, "%d centers, cost %lf, total distance %lf\n",
	   *k, cost, cost - z*(*k));
#endif
  }
  return(cost);
}

/* set feasible to be an array of feasible centers */
/* return number of feasible centers */
int selectfeasible(Points *points, int **feasible, int kmin)
{
  int i, ii;
  int numfeasible;
  float *feasweight;
  float totalweight;
  float mytotal;

  numfeasible = points->num;
  if (numfeasible > (ITER*kmin*log(kmin)))
    numfeasible = (int)(ITER*kmin*log(kmin));
  *feasible = (int *)malloc(numfeasible*sizeof(int));
  /* not many points, all will be feasible */
  if (numfeasible == points->num) {
    for (i=0;i<points->num;i++)
      (*feasible)[i] = i;
  }
  else {
    /* choose feasible points at random */
    totalweight = 0.0;
    for (i=0;i<points->num;i++)
      totalweight+=points->p[i].weight;
    feasweight = (float *)malloc(numfeasible*sizeof(float));
    for (i=0;i<numfeasible;i++)
      feasweight[i] = (lrand48()/(float)INT_MAX)*totalweight;
    qsort(feasweight, numfeasible, sizeof(float), floatcomp);
    ii=0; mytotal=0.0;
    for (i=0;i<points->num;i++) {
      mytotal+=points->p[i].weight;
      while ((ii < numfeasible)&&(mytotal > feasweight[ii]))
        (*feasible)[ii++] = i;
    }
    if (ii != numfeasible) fprintf(stderr, "Error in Feasible Centers\n");
    free(feasweight);
  }

  return(numfeasible);
}
/* compute approximate kmedian on the points */
float kmedian(Points *points, long kmin, long kmax)
{
  int i;
  float cost;
  float lastcost;
  float hiz, loz, z;
  long k;
  int flag;      
  int *feasible;
  int numfeasible;
  hiz = loz = 0.0;
  long numberOfPoints = points->num;
  long ptDimension = points->dim;

#ifdef PRINTINFO
  fprintf(stderr, "Starting Kmedian procedure\n");
  fprintf(stderr, "%d points in %d dimensions\n", numberOfPoints, ptDimension);
#endif

  for (i=0;i<numberOfPoints;i++)
    hiz += dist(points->p[i], points->p[0],
		ptDimension)*points->p[i].weight;
  loz=0.0; z = (hiz+loz)/2.0;

  shuffle(points);
  cost = speedy2(points, 0.001, &k, kmax);
  if (k <= kmax) {
    return(cost);
  }

  shuffle(points);
  cost = speedy(points, z, &k);

#ifdef PRINTINFO
  fprintf(stderr, "Finished first call to speedy\n");
#endif
  i=0;
  /* NEW: Check whether more centers than points! */
  if (points->num <= kmax) {
    /* just return all points as facilities */
    for (i=0;i<points->num;i++) {
      points->p[i].assign = i;
      points->p[i].cost = 0;
      return(0.0);
    }
  }
  /* give speedy SP chances to get at least kmin/2 facilities */
  while ((k < kmin)&&(i<SP)) {
    cost = speedy(points, z, &k);
    i++;
  }
  /* if still not enough facilities, assume z is too high */
  while (k < kmin) {
#ifdef PRINTINFO
    fprintf(stderr, "%lf %lf\n", loz, hiz);
    fprintf(stderr, "Speedy indicates we should try lower z\n");
#endif
    if (i >= SP) {hiz=z; z=(hiz+loz)/2.0; i=0;}
    shuffle(points);
    cost = speedy(points, z, &k);
    i++;
  }
  /* now we begin the binary search for real */
  /* must designate some points as feasible centers */
  /* this creates more consistancy between FL runs */
  /* helps to guarantee correct # of centers at the end */
  numfeasible = selectfeasible(points, &feasible, kmin); 

  while(1) {
#ifdef PRINTINFO
    fprintf(stderr, "loz = %lf, hiz = %lf\n", loz, hiz);
    fprintf(stderr, "Running Local Search...\n");
#endif
    /* first get a rough estimate on the FL solution */

    lastcost = cost;
    cost = FL(points, feasible, numfeasible,
	      z, &k, cost, (long)(ITER*kmax*log(kmax)), 0.1);

    /* if number of centers seems good, try a more accurate FL */
    if (((k <= (1.1)*kmax)&&(k >= (0.9)*kmin))||
	((k <= kmax+2)&&(k >= kmin-2))) {

#ifdef PRINTINFO
      fprintf(stderr, "Trying a more accurate local search...\n");
#endif
      /* may need to run a little longer here before halting without
	 improvement */
      cost = FL(points, feasible, numfeasible,
		z, &k, cost, (long)(ITER*kmax*log(kmax)), 0.001);
    }

    if (k > kmax) {
      /* facilities too cheap */
      /* increase facility cost and up the cost accordingly */
      loz = z; z = (hiz+loz)/2.0;
      cost += (z-loz)*k;
    }
    if (k < kmin) {
      /* facilities too expensive */
      /* decrease facility cost and reduce the cost accordingly */
      hiz = z; z = (hiz+loz)/2.0;
      cost += (z-hiz)*k;
    }

    /* if k is good, return the result */
    /* if we're stuck, just give up and return what we have */
    if (((k <= kmax)&&(k >= kmin))||((loz >= (0.999)*hiz)))
      { free(feasible); return(cost);}

  }
}

/* compute the means for the k clusters */
float contcenters(Points *points)
{
  long i, ii;
  float relweight;
  float cost;
  for (i=0;i<points->num;i++) {
    /* compute relative weight of this point to the cluster */
    if (points->p[i].assign != i) {
      relweight=points->p[points->p[i].assign].weight + points->p[i].weight;
      relweight = points->p[i].weight/relweight;
      for (ii=0;ii<points->dim;ii++) {
	points->p[points->p[i].assign].coord[ii]*=1.0-relweight;
	points->p[points->p[i].assign].coord[ii]+=
	  points->p[i].coord[ii]*relweight;
      }
      points->p[points->p[i].assign].weight += points->p[i].weight;
    }
  }
  cost = 0.0;
  for (i=0;i<points->num;i++)
    cost += dist(points->p[i], points->p[points->p[i].assign],
		 points->dim)*points->p[i].weight;
  return(cost);
}


/* print the centers to a file of a given name */

void outcenters(Points *points, char *outname)
{
  long i, ii;
  long k;

  int *is_a_median = (int *) malloc(points->num * sizeof(int));
  for ( i = 0; i < points->num; i++) {
    is_a_median[i] = 0;
  }
  printf("Phase I \n");
  /* mark the centers */
  for ( i = 0; i < points->num; i++ ) {
    is_a_median[points->p[i].assign] = 1;
  }

  k=0;

  /* count how many centers */
  for ( i = 0; i < points->num; i++ ) {
    if ( is_a_median[i] ) {
      k++;
    }
  }
  printf("Phase II \n");
  FILE *fpOut = fopen(outname, "w");

   printf("Phase III \n");
  /* output the centers */
  for ( i = 0; i < points->num; i++ ) {
    if ( is_a_median[i] ) {
      fprintf(fpOut, "%d", (int)points->p[i].weight);
      for ( ii = 0; ii < points->dim; ii++ ) {
	fprintf(fpOut, " %lf", points->p[i].coord[ii]);
      }
      fprintf(fpOut, "\n");
    }
  }
  fclose(fpOut);
  free(is_a_median);
}


/* copy the centers to a array, adjust the weights */

void copycenters(Points *points,Points *secondLevelPoints,long startindex,long *actualNumOfPoints)
{
  long i, ii,j;
  long k;

  int *is_a_median = (int *) malloc(points->num * sizeof(int));
  for ( i = 0; i < points->num; i++) {
    is_a_median[i] = 0;
  }

  /* mark the centers */
  for ( i = 0; i < points->num; i++ ) {
    is_a_median[points->p[i].assign] = 1;
  }

  k=0;

  /* count how many centers */
  for ( i = 0; i < points->num; i++ ) {
    if ( is_a_median[i] ) {
      k++;
    }
  }
  *actualNumOfPoints += k;
  
  j = startindex;
  /* copy the centers */
  for ( i = 0; i < points->num; i++ ) {
    if ( is_a_median[i] ) {
      //fprintf(fpOut, "%d", (int)points->p[i].weight);
      secondLevelPoints->p[j].weight = points->p[i].weight;
      for ( ii = 0; ii < points->dim; ii++ ) {
	//fprintf(fpOut, " %lf", points->p[i].coord[ii]);
	secondLevelPoints->p[j].coord[ii] = points->p[i].coord[ii];
      }
      j++;
    }
  }
  free(is_a_median);
}

int main(int argc, char **argv)
{
  char *outfilename = new char[MAXNAMESIZE];
  long kmin, kmax, k, dimension, N, samplesize;
  long m;
  long i;
  long actualSecondLevelPoints;

  Points points, sampl;
  Points secondLevelPoints;

  // Note: 00.12.11: (Liadan) Note that I've changed command line:
  // It's now a.out k1 k2 N d inputfile samplesize

  // Note: I changed that command line again (craupach). Additional parameter m for streaming

  int seed;
  if (SEED)
    getSeed(&seed);

  ifstream ifile;
  ifile.open(argv[5]);


  if ( readcommands(argc, argv, &kmin, &kmax, &points, &sampl, &samplesize,
		    &outfilename) == 0 ) {

    exit(0);
  } else {
	kmin = atoi(argv[1]);
  	kmax = atoi(argv[2]);
  	//strcpy(outfile, argv[7]);
  	points.num = atoi(argv[8]);
  	points.dim = atoi(argv[4]);
  	
  	samplesize = atoi(argv[6]);
	m = atoi(argv[8]);
  	N = atoi(argv[3]);
  }
  //init the bucket for the incoming stream
  points.p = (Point *)malloc(points.num*sizeof(Point));
  for(i=0;i<points.num;i++) {
    points.p[i].coord = (float *)malloc(points.dim*sizeof(float));
  }

  //init the second level
  actualSecondLevelPoints = 0;
  secondLevelPoints.dim = points.dim;
  secondLevelPoints.num = 2 * kmax * (N / m);
  secondLevelPoints.p   = (Point *)malloc(secondLevelPoints.num*sizeof(Point));
  for(i=0;i<secondLevelPoints.num;i++) {
    secondLevelPoints.p[i].coord = (float *)malloc(secondLevelPoints.dim*sizeof(float));
  }
  dimension = points.dim;
  //stream in the points
  for(i=0;i<(N / m);i++){
	//read the next batch of points
	printf("start loop \n");
	readpoints(&ifile, &points);
	printf("points read \n");
	kmedian(&points, kmin, kmax);
        contcenters(&points);
	long startindex = actualSecondLevelPoints; //just to avoid confusion by giving the variable as a parameter once as a pointer and once normal 
	printf("finished computing centers.... copying \n");
	copycenters(&points,&secondLevelPoints,startindex,&actualSecondLevelPoints);
	printf("completed batch %d with %d centres \n",i,actualSecondLevelPoints);
	printf("Points num: %d %d %d \n \n",points.num,m,i);
  }

  printf("starting the actual clustering\n");

 
  secondLevelPoints.num = actualSecondLevelPoints;
   //debug: print out the second level points
  for(i=0;i<secondLevelPoints.num;i++){
	printf("Point %d: First Coordinate %f, Weight %f \n",i,secondLevelPoints.p[i].coord[0],secondLevelPoints.p[i].weight);
  }

  kmedian(&secondLevelPoints, kmin, kmax);
  printf("kMedian\n");
  contcenters(&secondLevelPoints);
  printf("contcenters\n");
  outcenters(&secondLevelPoints, argv[7]);
  printf("finished\n");

}
